﻿using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.Data;
using System.Text.RegularExpressions;
using System.Web.UI.WebControls;
using System.Configuration;
using System.Web.UI;
using System.Web;
using System.IO;


namespace 书籍与目录检索
{
    class GetData
    {
        /// <summary>
        /// Downloads the page at the given address and decodes it to text.
        /// </summary>
        /// <remarks>
        /// The downloaded bytes are first decoded with <see cref="Encoding.Default"/>.
        /// If that text declares a charset inside a &lt;meta&gt; tag, and the charset names
        /// an available encoding different from the default one, the bytes are decoded
        /// again with the declared encoding. An unknown charset name is ignored and the
        /// default decoding is returned.
        /// </remarks>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>The decoded page text, or <c>null</c> when the download fails.</returns>
        public static string getHtml(string url)
        {
            try
            {
                byte[] myDataBuffer;
                using (WebClient myWebClient = new WebClient())
                {
                    myWebClient.Credentials = CredentialCache.DefaultCredentials;
                    myDataBuffer = myWebClient.DownloadData(url);
                }

                // First pass: decode with the platform default so the markup can be inspected.
                string strWebData = Encoding.Default.GetString(myDataBuffer);

                // Capture only the charset token itself. Handles both
                //   <meta http-equiv="Content-Type" content="text/html; charset=gb2312">
                // and
                //   <meta charset="utf-8">
                // without swallowing the surrounding quotes or following attributes.
                Match charSetMatch = Regex.Match(
                    strWebData,
                    "<meta[^<>]*charset\\s*=\\s*[\"']?([\\w.:-]+)",
                    RegexOptions.IgnoreCase);
                if (!charSetMatch.Success)
                    return strWebData;

                Encoding declaredEncoding;
                try
                {
                    declaredEncoding = Encoding.GetEncoding(charSetMatch.Groups[1].Value);
                }
                catch (ArgumentException)
                {
                    // Unknown or unsupported charset name: keep the default decoding
                    // rather than discarding a page that was downloaded successfully.
                    return strWebData;
                }

                if (!declaredEncoding.Equals(Encoding.Default))
                    strWebData = declaredEncoding.GetString(myDataBuffer);
                return strWebData;
            }
            catch (Exception)
            {
                // Best-effort contract: any failure is reported to the caller as null.
                return null;
            }
        }
        /// <summary>
        /// Requests a page with parameters and decodes the response with the given charset.
        /// Non-ASCII characters in <paramref name="param"/> must already be URL-encoded
        /// in the encoding the target page expects.
        /// </summary>
        /// <param name="url">Address to request.</param>
        /// <param name="param">Form/query parameters (already URL-encoded); null is treated as empty.</param>
        /// <param name="charSet">Name of the encoding used to decode the response; defaults to utf-8 when null or empty.</param>
        /// <param name="method">HTTP method, "POST" or "GET"; defaults to POST when null or empty.</param>
        /// <returns>The decoded response body.</returns>
        /// <exception cref="Exception">Any failure, wrapped with the original exception as <c>InnerException</c>.</exception>
        public static string getHtml(string url, string param, string charSet, string method)
        {
            try
            {
                if (string.IsNullOrEmpty(method))
                    method = "POST";
                if (string.IsNullOrEmpty(charSet))
                    charSet = "utf-8";
                if (param == null)
                    param = "";

                // Resolve the encoding before touching the network so that a bad
                // charset name fails immediately.
                Encoding responseEncoding = Encoding.GetEncoding(charSet);

                byte[] myDataBuffer;
                using (WebClient myWebClient = new WebClient())
                {
                    if (string.Equals(method, "GET", StringComparison.OrdinalIgnoreCase))
                    {
                        // A GET request cannot carry a body; the parameters travel in the query string.
                        string requestUrl = url;
                        if (param.Length > 0)
                            requestUrl += (url.IndexOf('?') >= 0 ? "&" : "?") + param;
                        myDataBuffer = myWebClient.DownloadData(requestUrl);
                    }
                    else
                    {
                        byte[] postData = Encoding.ASCII.GetBytes(param);
                        // Content-Length is filled in by WebClient itself, so only the
                        // content type needs to be declared here.
                        myWebClient.Headers.Add("Content-Type", "application/x-www-form-urlencoded");
                        myDataBuffer = myWebClient.UploadData(url, method, postData);
                    }
                }

                return responseEncoding.GetString(myDataBuffer);
            }
            catch (Exception ex)
            {
                throw new Exception("getHtml中发现异常" + ex.Message, ex);
            }
        }
        /// <summary>
        /// Loads the basic information of every active site (rows of ls_Site with Site_Active=1).
        /// </summary>
        /// <param name="sConnStr">Database connection string.</param>
        /// <returns>One <c>SiteInfo</c> per active site; an empty array when the query yields no view.</returns>
        public static SiteInfo[] getAllSiteInfo(string sConnStr)
        {
            // using blocks guarantee the data source and the view are released even if a
            // column lookup throws halfway through.
            using (SqlDataSource ds = new SqlDataSource())
            {
                ds.ConnectionString = sConnStr;
                ds.SelectCommand = "select * from ls_Site where Site_Active=1";

                using (DataView dv = (DataView)ds.Select(DataSourceSelectArguments.Empty))
                {
                    if (dv == null)
                        return new SiteInfo[0];

                    SiteInfo[] siteinfos = new SiteInfo[dv.Count];
                    for (int i = 0; i < siteinfos.Length; i++)
                    {
                        DataRowView row = dv[i];
                        SiteInfo site = new SiteInfo();
                        site.SiteID = row["Site_ID"].ToString();
                        site.SiteName = row["Site_Name"].ToString();
                        site.SiteEncode = row["Site_Encode"].ToString();
                        site.SiteMethod = row["Site_Method"].ToString();
                        site.SearchURL = row["Site_SearchURL"].ToString();
                        site.SearchParam = row["Site_SearchParam"].ToString();
                        site.ResultReg = row["Site_ResultReg"].ToString();
                        site.ResultOrder = row["Site_ResultOrder"].ToString();
                        site.ListFormat = row["Site_ListFormat"].ToString();
                        site.SiteRedirect = row["Site_Redirect"].ToString();
                        site.ResultReg2 = row["Site_ResultReg2"].ToString();
                        site.ResultOrder2 = row["Site_ResultOrder2"].ToString();
                        site.ListFormat2 = row["Site_ListFormat2"].ToString();
                        siteinfos[i] = site;
                    }
                    return siteinfos;
                }
            }
        }
        
        /// <summary>
        /// Cleans up HTML source: strips layout tags, comments, scripts, links and entities,
        /// and normalises every paragraph-like break to a single &lt;P&gt; marker.
        /// </summary>
        /// <param name="sHtml">HTML source text.</param>
        /// <param name="sRegContentExcept">
        /// Optional caller-supplied regular expression whose matches are removed first;
        /// null or empty means "nothing extra to remove".
        /// </param>
        /// <returns>The cleaned text, trimmed of leading and trailing white space.</returns>
        /// <exception cref="Exception">Any failure (for example a null <paramref name="sHtml"/> or an invalid pattern), wrapped with the original as <c>InnerException</c>.</exception>
        public static string clearHTML(string sHtml, string sRegContentExcept)
        {
            const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Multiline;
            try
            {
                if (!string.IsNullOrEmpty(sRegContentExcept))
                    sHtml = Regex.Replace(sHtml, sRegContentExcept, "", options);

                // The rules are order-dependent (later rules rely on earlier output),
                // so they are applied strictly in table order.
                foreach (string[] rule in s_clearHtmlRules)
                    sHtml = Regex.Replace(sHtml, rule[0], rule[1], options);

                return sHtml.Trim();
            }
            catch (Exception ex)
            {
                throw new Exception("在clearHTML中出现异常:" + ex.Data + ex.Message, ex);
            }
        }

        /// <summary>
        /// Ordered { pattern, replacement } pairs applied by <see cref="clearHTML"/>.
        /// </summary>
        private static readonly string[][] s_clearHtmlRules = new string[][]
        {
            new string[] { "<[ /]*CENTER[^<>]*>", "" },
            new string[] { "<[ /]*UL[^<>]*>", "" },
            new string[] { "<[ /]*LI[^<>]*>", "" },
            new string[] { "<[ /]*SPAN[^<>]*>", "" },
            new string[] { "<[ ]*HR[^<>]*>", "" },
            new string[] { "<!--[^\\f]*?-->", "" },
            new string[] { "<[ /]*TABLE[^<>]*>", "" },
            new string[] { "<[ /]*FONT[^<>]*>", "" },
            new string[] { "<[ /]*TR[^<>]*>", "" },
            new string[] { "<[ /]*TD[^<>]*>", "" },
            new string[] { "<[ /]*BR[^<>]*>", "<P>" },
            new string[] { "<[ /]*STRONG>", "<P>" },
            new string[] { "&[a-z0-9#]{3,5};", "" },
            new string[] { "<[^<>]*DIV[^<>]*>", "<P>" },
            new string[] { "<[^<>]*SCRIPT[^<>]*>[^<>]*<[^<>]*/SCRIPT[^<>]*>", "" },
            new string[] { "<[^<>]*FIELDSET[^<>]*>[^\v]*<[^<>]*/FIELDSET[^<>]*>", "" },
            new string[] { "<[^<>]*IFRAM[^<>]*>[^<>]*<[^<>]*/IFRAM[^<>]*>", "" },
            // Any script tag left without a matching partner.
            new string[] { "<[^<>]*SCRIPT[^<>]*>", "" },
            // Complete <A>...</A> elements, then any stray anchor tags.
            new string[] { "<[^<>]*A[^<>]*>[^<>]*<[^<>]*/A[^<>]*>", "" },
            new string[] { "<[ /]*A[^<>]*>", "<P>" },
            // Normalise every paragraph tag variant to one form.
            new string[] { "<[ /]*P[^<>]*>", "<P>" },
            // Drop line breaks, tabs and other control white space.
            new string[] { "[\\f\\n\\t\\r\\v]*", "" },
            // Collapse runs of consecutive paragraph markers into one.
            new string[] { "<P>[<P> ]*<P>", "<P>" },
        };

        /// <summary>
        /// Searches one site for the given book and records every matching catalogue URL.
        /// </summary>
        /// <remarks>
        /// The book name is URL-encoded in the site's encoding and substituted for the
        /// &lt;#KEYWORD&gt; placeholder of the site's search parameters. The result page is
        /// parsed with the site's first result pattern, where both book name and writer
        /// must match. If that pattern finds nothing at all and the site is flagged as
        /// redirecting (SiteRedirect == "1"), the second pattern is tried, where only the
        /// book name must match.
        /// </remarks>
        /// <param name="book">Book to look for.</param>
        /// <param name="siteinfo">Site to search.</param>
        /// <param name="sConnStr">Database connection string used to store found catalogue URLs.</param>
        /// <returns>
        /// One "site name：url" line per newly inserted catalogue URL; an empty string when
        /// nothing was inserted or the search request failed.
        /// </returns>
        /// <exception cref="Exception">A failure while analysing the result page, wrapped with the original as <c>InnerException</c>.</exception>
        public static string findBookInSite(Book book, SiteInfo siteinfo, string sConnStr)
        {
            string sKeyWordEncode = HttpUtility.UrlEncode(book.BookName, Encoding.GetEncoding(siteinfo.SiteEncode));
            string sParam = siteinfo.SearchParam.Replace("<#KEYWORD>", sKeyWordEncode);

            // Fetch the search result page. A site that cannot be reached simply
            // contributes no results instead of aborting the whole search.
            string sHtml;
            try
            {
                sHtml = GetData.getHtml(siteinfo.SearchURL, sParam, siteinfo.SiteEncode, siteinfo.SiteMethod);
            }
            catch (Exception)
            {
                return "";
            }
            if (sHtml == null)
                return "";

            // Analyse the page.
            try
            {
                bool bFirstMatch;
                string sReturnMessage = collectBookLists(
                    book, siteinfo, sConnStr, sHtml,
                    siteinfo.ResultReg, siteinfo.ResultOrder, siteinfo.ListFormat,
                    true, out bFirstMatch);

                // Sites that jump straight to the book page on a single hit need the
                // second pattern; it is only tried when the first one matched nothing.
                if (!bFirstMatch && siteinfo.SiteRedirect == "1")
                {
                    bool bSecondMatch;
                    sReturnMessage += collectBookLists(
                        book, siteinfo, sConnStr, sHtml,
                        siteinfo.ResultReg2, siteinfo.ResultOrder2, siteinfo.ListFormat2,
                        false, out bSecondMatch);
                }

                return sReturnMessage;
            }
            catch (Exception ex)
            {
                throw new Exception("在分析" + siteinfo.SiteName + "检索" + book.BookName + "的结果时候出现异常:" + ex.Message, ex);
            }
        }

        /// <summary>
        /// Runs one result pattern over the page and stores the catalogue URL of every hit
        /// that belongs to the requested book.
        /// </summary>
        /// <param name="book">Book being searched for.</param>
        /// <param name="siteinfo">Site the page came from.</param>
        /// <param name="sConnStr">Database connection string.</param>
        /// <param name="sHtml">Search result page.</param>
        /// <param name="sResultReg">Regular expression that matches one search hit.</param>
        /// <param name="sResultOrder">Comma-separated group numbers: book name, writer, catalogue URL.</param>
        /// <param name="sListFormat">URL template containing the &lt;#LIST&gt; placeholder.</param>
        /// <param name="bCheckWriter">When true the writer must match as well as the book name.</param>
        /// <param name="bMatched">Set to whether the pattern matched the page at least once.</param>
        /// <returns>One "site name：url" line per newly inserted catalogue URL.</returns>
        private static string collectBookLists(Book book, SiteInfo siteinfo, string sConnStr, string sHtml,
            string sResultReg, string sResultOrder, string sListFormat, bool bCheckWriter, out bool bMatched)
        {
            Match match = Regex.Match(sHtml, sResultReg, RegexOptions.IgnoreCase | RegexOptions.Multiline);
            bMatched = match.Success;
            if (!bMatched)
                return "";

            // The group order is only needed (and only parsed) once there is a hit.
            string[] sOrder = sResultOrder.Split(',');
            int iNameGroup = Convert.ToInt32(sOrder[0]);
            int iWriterGroup = Convert.ToInt32(sOrder[1]);
            int iListGroup = Convert.ToInt32(sOrder[2]);

            StringBuilder sbMessage = new StringBuilder();
            for (; match.Success; match = match.NextMatch())
            {
                // clearHTML also trims surrounding white space.
                string sBookName = GetData.clearHTML(match.Groups[iNameGroup].Value, "");
                string sWriter = GetData.clearHTML(match.Groups[iWriterGroup].Value, "");
                string sListURL = GetData.clearHTML(match.Groups[iListGroup].Value, "");
                sListURL = sListFormat.Replace("<#LIST>", sListURL);

                if (sBookName != book.BookName)
                    continue;
                if (bCheckWriter && sWriter != book.BookWriter)
                    continue;

                // addBookList returns 1 only when a new row was actually inserted.
                if (GetData.addBookList(book.BookID, siteinfo, sListURL, sConnStr) == 1)
                    sbMessage.Append(siteinfo.SiteName + "：" + sListURL + "\r\n");
            }
            return sbMessage.ToString();
        }

        /// <summary>
        /// Inserts a catalogue URL for a book on a site into ls_List, unless a row with
        /// the same URL already exists.
        /// </summary>
        /// <param name="sBookID">Book ID.</param>
        /// <param name="siteinfo">Site the catalogue belongs to; only its SiteID is used.</param>
        /// <param name="sListURL">Catalogue URL.</param>
        /// <param name="sConnStr">Database connection string.</param>
        /// <returns>Number of rows inserted.</returns>
        /// <exception cref="Exception">Any failure, wrapped with the original as <c>InnerException</c>.</exception>
        public static int addBookList(string sBookID, SiteInfo siteinfo, string sListURL, string sConnStr)
        {
            try
            {
                // using releases the data source even when the insert throws.
                using (SqlDataSource ds = new SqlDataSource())
                {
                    ds.ConnectionString = sConnStr;
                    ds.InsertCommand = "insert into ls_List(List_BookName,List_SiteName,List_BookID,List_SiteID,List_URL,List_Reg) "
                        + "select Book_Name,Site_Name,Book_ID,Site_ID,@list,Site_ListReg from ls_Book,ls_Site  "
                        + "where  Book_ID=@bookid and Site_ID=@siteid and not exists (select * from ls_List where List_URL=@list)";
                    ds.InsertParameters.Add("siteid", siteinfo.SiteID);
                    ds.InsertParameters.Add("list", sListURL);
                    ds.InsertParameters.Add("bookid", sBookID);
                    return ds.Insert();
                }
            }
            catch (Exception ex)
            {
                throw new Exception("在插入书籍目录" + sListURL + "时候出现异常:" + ex.Message, ex);
            }
        }

        /// <summary>
        /// Returns the leading part of an address, up to and including its last "/".
        /// </summary>
        /// <param name="url">Address to shorten.</param>
        /// <returns>
        /// Everything up to and including the last "/"; an empty string when the address
        /// contains no "/".
        /// </returns>
        public static string getPathFromURL(string url)
        {
            try
            {
                // LastIndexOf yields -1 when there is no slash, which makes the length 0.
                int iPathLength = url.LastIndexOf("/") + 1;
                string sPath = url.Substring(0, iPathLength);
                return sPath;
            }
            catch (Exception ex)
            {
                string sMessage = "地址转换 getPathFromURL异常:" + ex.Message;
                throw new Exception(sMessage);
            }
        }

        /// <summary>
        /// Builds a one-line summary of ls_List: rows with List_Level=5, rows with
        /// List_Level=1, rows whose book is in ls_MyBook, and rows with List_Active=0.
        /// </summary>
        /// <param name="sConnStr">Database connection string.</param>
        /// <returns>The four counts formatted as a single status string.</returns>
        /// <exception cref="Exception">Any failure, wrapped with the original as <c>InnerException</c>.</exception>
        public static string getListStatus(string sConnStr)
        {
            try
            {
                using (SqlDataSource ds = new SqlDataSource())
                {
                    ds.ConnectionString = sConnStr;

                    string hot = selectFirstValue(ds, "select count(*) from ls_List where List_Level=5 ");
                    string cold = selectFirstValue(ds, "select count(*) from ls_List where List_Level=1 ");
                    string bookshelf = selectFirstValue(ds, "select count(*) from ls_List where exists (select * from ls_MyBook where MyBook_BookID=List_BookID) ");
                    string failed = selectFirstValue(ds, "select count(*) from ls_List where List_Active=0 ");

                    return "热门：" + hot + "，完本：" + cold + "，书架：" + bookshelf + "，失败：" + failed;
                }
            }
            catch (Exception ex)
            {
                throw new Exception("获得当前目录状态getListStatus 发生异常:" + ex.Message + ex.Data, ex);
            }
        }

        /// <summary>
        /// Runs a select statement on the data source and returns the first column of
        /// the first row as text, disposing the intermediate view.
        /// </summary>
        private static string selectFirstValue(SqlDataSource ds, string sSelectCommand)
        {
            ds.SelectCommand = sSelectCommand;
            using (DataView dv = (DataView)ds.Select(DataSourceSelectArguments.Empty))
            {
                return dv[0][0].ToString();
            }
        }
        /// <summary>
        /// Generates the static introduction page of a book from the template
        /// "model/BookInfoTemplate.htm", unless the page already exists.
        /// </summary>
        /// <remarks>
        /// The page is written to "book/{id / 1000}/{id}.htm" below the site path, as
        /// UTF-8. Each &lt;#...&gt; placeholder in the template is replaced with the
        /// corresponding book value.
        /// </remarks>
        /// <param name="sBookID">Book ID; must be parseable as an integer.</param>
        /// <param name="sSitePath">Root directory of the site on disk.</param>
        /// <param name="sConnStr">Database connection string.</param>
        public static void buildBookPage(string sBookID, string sSitePath, string sConnStr)
        {
            string sTemplateMap = sSitePath + "/model/BookInfoTemplate.htm";
            // Pages are bucketed into sub-directories of 1000 books each.
            string sDirMap = sSitePath + "/book/" + (Convert.ToInt32(sBookID) / 1000).ToString() + "/";
            string sPageMap = sDirMap + sBookID + ".htm";

            // An existing page is left untouched.
            if (File.Exists(sPageMap))
                return;

            if (!Directory.Exists(sDirMap))
                Directory.CreateDirectory(sDirMap);

            string sModel;
            using (StreamReader sr = new StreamReader(sTemplateMap))
            {
                sModel = sr.ReadToEnd();
            }

            // Fill in the placeholders.
            Book book = getBookInfo(sBookID, sConnStr);

            sModel = sModel.Replace("<#BOOKID>", book.BookID);
            sModel = sModel.Replace("<#BOOKNAME>", book.BookName);
            sModel = sModel.Replace("<#BOOKWRITER>", book.BookWriter);
            sModel = sModel.Replace("<#WEEKVISIT>", book.WeekVisit);
            sModel = sModel.Replace("<#TOTALVISIT>", book.TotalVisit);
            sModel = sModel.Replace("<#TOTALRMD>", book.TotalRMD);
            sModel = sModel.Replace("<#BOOKREM>", book.BookRem);
            sModel = sModel.Replace("<#BOOKSTATE>", book.BookState);
            sModel = sModel.Replace("<#SLIST>", book.BookList);
            sModel = sModel.Replace("<#BOOKCOVER>", book.BookCover);
            sModel = sModel.Replace("<#BOOKTYPE>", book.BookType);
            sModel = sModel.Replace("<#USERNAME>", book.UserName);
            sModel = sModel.Replace("<#ADDTIME>", book.AddTime);
            sModel = sModel.Replace("<#TYPESELECT>", getTypeSelect(book.BookTypeID, sConnStr));

            // Save; using closes the file even if the write fails.
            using (StreamWriter sw = new StreamWriter(sPageMap, false, Encoding.UTF8))
            {
                sw.WriteLine(sModel);
            }
        }
        /// <summary>
        /// Loads a book, together with its type name and catalogue list script, from the database.
        /// </summary>
        /// <param name="sBookID">Book ID.</param>
        /// <param name="sConnStr">Database connection string.</param>
        /// <returns>
        /// The populated book; a freshly constructed, unpopulated <c>Book</c> when no row matches.
        /// </returns>
        public static Book getBookInfo(string sBookID, string sConnStr)
        {
            Book book = new Book();

            using (SqlDataSource ds = new SqlDataSource())
            {
                ds.ConnectionString = sConnStr;
                ds.SelectCommand = "select * from ls_Book inner join ls_BookType on Book_Type=BookType_ID where Book_ID=@bookid";
                ds.SelectParameters.Add("bookid", sBookID);

                // using tolerates a null view, so no separate Dispose call can fail on it.
                using (DataView dv = (DataView)ds.Select(DataSourceSelectArguments.Empty))
                {
                    if (dv != null && dv.Count > 0)
                    {
                        DataRowView row = dv[0];
                        book.BookID = row["Book_ID"].ToString();
                        book.BookName = row["Book_Name"].ToString();
                        book.BookWriter = row["Book_Writer"].ToString();
                        book.BookRem = row["Book_Rem"].ToString();
                        book.BookCover = row["Book_Cover"].ToString();
                        book.WeekVisit = row["Book_WeekVisit"].ToString();
                        book.TotalVisit = row["Book_TotalVisit"].ToString();
                        book.BookType = row["BookType_Name"].ToString();
                        book.AddTime = row["Book_AddTime"].ToString();
                        book.UserName = row["Book_UserName"].ToString();
                        book.BookTypeID = row["Book_Type"].ToString();
                        book.TotalRMD = row["Book_TotalTicket"].ToString();
                        // Book_Active == False is shown as "finished", anything else as "ongoing".
                        if (row["Book_Active"].ToString() == "False")
                            book.BookState = "已完成";
                        else
                            book.BookState = "连载中";
                        book.BookList = getList(sBookID, sConnStr);
                    }
                }
            }
            return book;
        }
        /// <summary>
        /// Builds the script block that lists every catalogue of a book, ordered by
        /// total failure count (List_FailCount + List_RegFailCount, ascending).
        /// </summary>
        /// <remarks>
        /// Each row becomes one call of the form
        /// <c>sList('site name','book name','catalogue url','user name','97%');</c>
        /// where the percentage is 10000 / (100 + failure counts). Values are escaped so
        /// that quotes or backslashes in the data cannot break the script.
        /// </remarks>
        /// <param name="sBookID">Book ID.</param>
        /// <param name="sConnStr">Database connection string.</param>
        /// <returns>A complete &lt;script&gt; element; it contains no calls when the book has no catalogues.</returns>
        public static string getList(string sBookID, string sConnStr)
        {
            StringBuilder sbReturn = new StringBuilder("<script language=\"javascript\" type=\"text/javascript\">\r\n");

            using (SqlDataSource ds = new SqlDataSource())
            {
                ds.ConnectionString = sConnStr;
                ds.SelectCommand = "select * from ls_List where List_BookID=@bookid order by List_FailCount+List_RegFailCount";
                ds.SelectParameters.Add("bookid", sBookID);

                // using tolerates a null view, so no separate Dispose call can fail on it.
                using (DataView dv = (DataView)ds.Select(DataSourceSelectArguments.Empty))
                {
                    if (dv != null)
                    {
                        for (int i = 0; i < dv.Count; i++)
                        {
                            DataRowView row = dv[i];
                            string sHealth = (Convert.ToString(10000 / (100 + (int)row["List_RegFailCount"] + (int)row["List_FailCount"]))) + "%";
                            sbReturn.Append("sList('")
                                .Append(escapeJsLiteral(row["List_SiteName"].ToString())).Append("','")
                                .Append(escapeJsLiteral(row["List_BookName"].ToString())).Append("','")
                                .Append(escapeJsLiteral(row["List_URL"].ToString())).Append("','")
                                .Append(escapeJsLiteral(row["List_UserName"].ToString())).Append("','")
                                .Append(sHealth).Append("');\r\n");
                        }
                    }
                }
            }

            sbReturn.Append("</script>\r\n");
            return sbReturn.ToString();
        }

        /// <summary>
        /// Escapes text for use inside a single-quoted JavaScript string literal that is
        /// embedded in an HTML script element.
        /// </summary>
        private static string escapeJsLiteral(string sValue)
        {
            return sValue
                .Replace("\\", "\\\\")   // must come first so later escapes are not doubled
                .Replace("'", "\\'")
                .Replace("\r", "\\r")
                .Replace("\n", "\\n")
                .Replace("</", "<\\/");  // keeps "</script>" inside data from closing the element
        }
        /// <summary>
        /// Builds the drop-down list (&lt;select id='select_booktype'&gt;) of all book
        /// types, with the given type pre-selected.
        /// </summary>
        /// <param name="booktype">ID of the book type to mark as selected.</param>
        /// <param name="sConnStr">Database connection string.</param>
        /// <returns>The markup of the select element, one option per book type ordered by ID.</returns>
        public static string getTypeSelect(string booktype, string sConnStr)
        {
            StringBuilder sbReturn = new StringBuilder("<select id='select_booktype'>\r\n");

            using (SqlDataSource ds = new SqlDataSource())
            {
                ds.ConnectionString = sConnStr;
                ds.SelectCommand = "select distinct BookType_ID,BookType_Name from ls_BookType  order by BookType_ID";

                // using tolerates a null view, so no separate Dispose call can fail on it.
                using (DataView dv = (DataView)ds.Select(DataSourceSelectArguments.Empty))
                {
                    if (dv != null)
                    {
                        for (int i = 0; i < dv.Count; i++)
                        {
                            string sTypeID = dv[i]["BookType_ID"].ToString();
                            string sTypeName = dv[i]["BookType_Name"].ToString();

                            sbReturn.Append("<option value='" + sTypeID + "'");
                            if (booktype == sTypeID)
                                sbReturn.Append(" selected='selected'");
                            sbReturn.Append(">" + sTypeName + "</option>\r\n");
                        }
                    }
                }
            }

            sbReturn.Append("</select>\r\n");
            return sbReturn.ToString();
        }
    }
}
